# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Read the Chicago crime extract from the local Downloads folder
csv_path = r"C:\Users\jki\Downloads\Chicago_Crime_Data-v2.csv"
chicago_crime_df = pd.read_csv(csv_path)
chicago_crime_df.head()


# Count the missing entries in every column before any cleaning
missing_values = chicago_crime_df.isnull().sum()
print(missing_values)

ID                        0
CASE_NUMBER               0
DATE                      0
BLOCK                     0
IUCR                      0
PRIMARY_TYPE              0
DESCRIPTION               0
LOCATION_DESCRIPTION      0
ARREST                    0
DOMESTIC                  0
BEAT                      0
DISTRICT                  0
WARD                     43
COMMUNITY_AREA_NUMBER    43
FBICODE                   0
X_COORDINATE              4
Y_COORDINATE              4
YEAR                      0
UPDATEDON                 0
LATITUDE                  4
LONGITUDE                 4
LOCATION                  4
dtype: int64


# Discard every row that lacks a value in any of the columns listed below
columns_with_gaps = [
    'WARD',
    'COMMUNITY_AREA_NUMBER',
    'X_COORDINATE',
    'Y_COORDINATE',
    'LATITUDE',
    'LONGITUDE',
    'LOCATION',
]
chicago_crime_df.dropna(subset=columns_with_gaps, inplace=True)
# Recount the missing entries to verify that none remain
missing_values = chicago_crime_df.isnull().sum()
print(missing_values)

ID                       0
CASE_NUMBER              0
DATE                     0
BLOCK                    0
IUCR                     0
PRIMARY_TYPE             0
DESCRIPTION              0
LOCATION_DESCRIPTION     0
ARREST                   0
DOMESTIC                 0
BEAT                     0
DISTRICT                 0
WARD                     0
COMMUNITY_AREA_NUMBER    0
FBICODE                  0
X_COORDINATE             0
Y_COORDINATE             0
YEAR                     0
UPDATEDON                0
LATITUDE                 0
LONGITUDE                0
LOCATION                 0
dtype: int64


# Summary statistics of the numeric columns; check the min row for unexpected negative values

chicago_crime_df.describe()


# Inspect each column's data type and non-null count
chicago_crime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487 entries, 0 to 531
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     487 non-null    int64  
 1   CASE_NUMBER            487 non-null    object 
 2   DATE                   487 non-null    object 
 3   BLOCK                  487 non-null    object 
 4   IUCR                   487 non-null    object 
 5   PRIMARY_TYPE           487 non-null    object 
 6   DESCRIPTION            487 non-null    object 
 7   LOCATION_DESCRIPTION   487 non-null    object 
 8   ARREST                 487 non-null    bool   
 9   DOMESTIC               487 non-null    bool   
 10  BEAT                   487 non-null    int64  
 11  DISTRICT               487 non-null    int64  
 12  WARD                   487 non-null    float64
 13  COMMUNITY_AREA_NUMBER  487 non-null    float64
 14  FBICODE                487 non-null    object 
 15  X_COORDINATE           487 non-null    float64
 16  Y_COORDINATE           487 non-null    float64
 17  YEAR                   487 non-null    int64  
 18  UPDATEDON              487 non-null    object 
 19  LATITUDE               487 non-null    float64
 20  LONGITUDE              487 non-null    float64
 21  LOCATION               487 non-null    object 
dtypes: bool(2), float64(6), int64(4), object(10)
memory usage: 80.8+ KB


# Convert the integer YEAR column to datetime.
# An explicit format is required: without it pandas treats the integers as
# nanoseconds since the Unix epoch, so 2004 becomes
# 1970-01-01 00:00:00.000002004 instead of 2004-01-01.
chicago_crime_df['YEAR'] = pd.to_datetime(chicago_crime_df['YEAR'], format='%Y')
chicago_crime_df['YEAR'].info()

<class 'pandas.core.series.Series'>
Int64Index: 487 entries, 0 to 531
Series name: YEAR
Non-Null Count  Dtype         
--------------  -----         
487 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 7.6 KB


# Show the first five rows (head() returns five rows by default)
print(chicago_crime_df.head())

         ID CASE_NUMBER                    DATE                     BLOCK  \
0   3512276    HK587712  08/28/2004 05:50:56 PM        047XX S KEDZIE AVE   
1   3406613    HK456306  06/26/2004 12:40:00 PM  009XX N CENTRAL PARK AVE   
2   8002131    HT233595  04/04/2011 05:45:00 AM        043XX S WABASH AVE   
3   7903289    HT133522  12/30/2010 04:30:00 PM      083XX S KINGSTON AVE   
4  10402076    HZ138551  02/02/2016 07:30:00 PM           033XX W 66TH ST   

  IUCR PRIMARY_TYPE                    DESCRIPTION  \
0  890        THEFT                  FROM BUILDING   
1  820        THEFT                 $500 AND UNDER   
2  820        THEFT                 $500 AND UNDER   
3  840        THEFT  FINANCIAL ID THEFT: OVER $300   
4  820        THEFT                 $500 AND UNDER   

           LOCATION_DESCRIPTION  ARREST  DOMESTIC  ...  WARD  \
0            SMALL RETAIL STORE   False     False  ...  14.0   
1                         OTHER   False     False  ...  27.0   
2  NURSING HOME/RETIREMENT HOME   False     False  ...   3.0   
3                     RESIDENCE   False     False  ...   7.0   
4                         ALLEY   False     False  ...  15.0   

   COMMUNITY_AREA_NUMBER  FBICODE  X_COORDINATE Y_COORDINATE  \
0                   58.0        6     1155838.0    1873050.0   
1                   23.0        6     1152206.0    1906127.0   
2                   38.0        6     1177436.0    1876313.0   
3                   46.0        6     1194622.0    1850125.0   
4                   66.0        6     1155240.0    1860661.0   

                           YEAR               UPDATEDON   LATITUDE  LONGITUDE  \
0 1970-01-01 00:00:00.000002004  02/10/2018 03:50:01 PM  41.807440 -87.703956   
1 1970-01-01 00:00:00.000002004  02/28/2018 03:56:25 PM  41.898280 -87.716406   
2 1970-01-01 00:00:00.000002011  02/10/2018 03:50:01 PM  41.815933 -87.624642   
3 1970-01-01 00:00:00.000002010  02/10/2018 03:50:01 PM  41.743665 -87.562463   
4 1970-01-01 00:00:00.000002016  02/10/2018 03:50:01 PM  41.773455 -87.706480   

                        LOCATION  
0    (41.8074405, -87.703955849)  
1  (41.898279962, -87.716405505)  
2  (41.815933131, -87.624642127)  
3  (41.743665322, -87.562462756)  
4  (41.773455295, -87.706480471)  

[5 rows x 22 columns]


# Find the minimum date.
# DATE is loaded as text in MM/DD/YYYY form, so min() on the raw column compares
# strings character by character and returns the smallest month/day text rather
# than the earliest moment. Parse to datetime first so the comparison is
# chronological. Parsing into a separate Series leaves the DATE column untouched.
parsed_dates = pd.to_datetime(chicago_crime_df['DATE'], format='%m/%d/%Y %I:%M:%S %p')
oldest_date = parsed_dates.min()
print("Oldest data point in the dataset:", oldest_date)

Oldest data point in the dataset: 01/01/2010 01:22:08 AM


import pandas as pd

# Parse the DATE strings (12-hour clock with AM/PM marker) into timestamps
date_format = '%m/%d/%Y %I:%M:%S %p'
chicago_crime_df['DATE'] = pd.to_datetime(chicago_crime_df['DATE'], format=date_format)

# Derive YEAR from the parsed timestamp
chicago_crime_df['YEAR'] = chicago_crime_df['DATE'].dt.year

# Number of recorded crimes in each year
crimes_by_year = chicago_crime_df.groupby('YEAR').size()

# Busiest year, and the crime count recorded for it
max_crimes_year = crimes_by_year.idxmax()
max_crimes_count = crimes_by_year.loc[max_crimes_year]

print("Year with the largest amount of crimes:", max_crimes_year)
print("Number of crimes committed that year:", max_crimes_count)

Year with the largest amount of crimes: 2005
Number of crimes committed that year: 44


# Five most common crime types in 2020 and their arrest rates
# (arrest rate = arrests divided by recorded crimes of that type).
# Keep only the rows whose YEAR equals 2020
crimes_2020 = chicago_crime_df[chicago_crime_df['YEAR'] == 2020]

if crimes_2020.empty:
    # Nothing to analyse when the year is absent from the data
    print("No crimes recorded for the year 2020.")
else:
    # Frequency of every crime type, most common first
    crime_counts_2020 = crimes_2020['PRIMARY_TYPE'].value_counts()

    # The five most frequent types
    top_5_crimes_2020 = crime_counts_2020.head(5)

    print("Five most common crimes in 2020:")
    print(top_5_crimes_2020)

    # Arrest totals per type from a single grouping pass, then one rate per
    # top-five type, stored in top-five order
    arrests_per_type = crimes_2020.groupby('PRIMARY_TYPE')['ARREST'].sum()
    arrest_rates_2020 = {
        crime_type: arrests_per_type[crime_type] / crime_counts_2020[crime_type]
        for crime_type in top_5_crimes_2020.index
    }

    if not arrest_rates_2020:
        print("No crimes recorded for the year 2020.")
    else:
        # Ties resolve to the type that appears first in the top-five order
        highest_arrest_rate_crime = max(arrest_rates_2020, key=arrest_rates_2020.get)
        lowest_arrest_rate_crime = min(arrest_rates_2020, key=arrest_rates_2020.get)

        print("\nCrime with the highest arrest rate among the top 5:", highest_arrest_rate_crime)
        print("Arrest rate:", arrest_rates_2020[highest_arrest_rate_crime])

        print("\nCrime with the lowest arrest rate among the top 5:", lowest_arrest_rate_crime)
        print("Arrest rate:", arrest_rates_2020[lowest_arrest_rate_crime])

No crimes recorded for the year 2020.


import matplotlib.pyplot as plt

# Yearly totals: recorded crimes, and crimes that ended in an arrest
crimes_by_year = chicago_crime_df.groupby('YEAR').size()
arrests_by_year = chicago_crime_df.groupby('YEAR')['ARREST'].sum()

# Share of each year's crimes that led to an arrest
arrest_rates_by_year = arrests_by_year.div(crimes_by_year)

# Year whose arrest rate is the highest, together with that rate
year_highest_arrest_rate = arrest_rates_by_year.idxmax()
highest_arrest_rate = arrest_rates_by_year.loc[year_highest_arrest_rate]

print("Year with the highest arrest rate:", year_highest_arrest_rate)
print("Highest arrest rate:", highest_arrest_rate)

# Line chart of the yearly crime counts
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(crimes_by_year.index, crimes_by_year.values, marker='o', linestyle='-')
ax.set_title('Number of Crimes per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Crimes')
ax.grid(True)
plt.show()

Year with the highest arrest rate: 2001
Highest arrest rate: 1.0


import matplotlib.pyplot as plt

# Number of crimes per year that led to an arrest
arrests_by_year = chicago_crime_df.groupby('YEAR')['ARREST'].sum()

# Year with the largest arrest total, and that total
year_most_arrests = arrests_by_year.idxmax()
most_arrests = arrests_by_year.loc[year_most_arrests]

print("Year with the most arrests:", year_most_arrests)
print("Number of arrests made during that year:", most_arrests)

# Line chart of the yearly arrest totals
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(arrests_by_year.index, arrests_by_year.values, marker='o', linestyle='-')
ax.set_title('Total Number of Arrests per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Total Number of Arrests')
ax.grid(True)
plt.show()

Year with the most arrests: 2005
Number of arrests made during that year: 18


import matplotlib.pyplot as plt

# Arrest-rate trend over time and the largest change between adjacent years.

# Calculate the total number of crimes and total number of arrests per year
crimes_by_year = chicago_crime_df.groupby('YEAR').size()
arrests_by_year = chicago_crime_df.groupby('YEAR')['ARREST'].sum()

# Calculate arrest rates per year
arrest_rates_by_year = arrests_by_year / crimes_by_year

# Plot the trend for arrest rate over time
plt.figure(figsize=(10, 6))
plt.plot(arrest_rates_by_year.index, arrest_rates_by_year.values, marker='o', linestyle='-')
plt.title('Arrest Rate Over Time')
plt.xlabel('Year')
plt.ylabel('Arrest Rate')
plt.grid(True)
plt.show()

# Find the years with the biggest change in arrest rate.
# diff() gives the signed change from the previous row to the current one; the
# first row has no predecessor (NaN) and is dropped. Rows are adjacent entries
# of the grouped result, which are not necessarily consecutive calendar years
# if a year has no records.
rate_changes = arrest_rates_by_year.diff().dropna()
abs_changes = rate_changes.abs()

# Defaults used when there are fewer than two years or the rate never changes
max_change = 0
year_start_max_change = None
year_end_max_change = None
change_direction = "change"

if not abs_changes.empty and abs_changes.max() > 0:
    # idxmax returns the first label holding the maximum, so ties resolve to
    # the earliest pair of years
    year_end_max_change = abs_changes.idxmax()
    end_position = arrest_rates_by_year.index.get_loc(year_end_max_change)
    year_start_max_change = arrest_rates_by_year.index[end_position - 1]
    max_change = abs_changes.loc[year_end_max_change]
    # Report the real direction: the largest absolute change may be a rise
    change_direction = "drop" if rate_changes.loc[year_end_max_change] < 0 else "rise"

print("Between which years can you see the biggest change in Arrest Rate?")
print("Between {} and {}".format(year_start_max_change, year_end_max_change))

# Analysis and comments on the conclusions
# Please note that specific reasons for changes in arrest rate may vary and require further investigation,
# such as changes in policing strategies, crime reporting methods, community relations, etc.
print("\nPossible reasons for the {} in arrest rate between {} and {}:".format(change_direction, year_start_max_change, year_end_max_change))
print("- Changes in law enforcement policies or priorities")
print("- Changes in community relations or trust in law enforcement")
print("- Changes in crime reporting methods or data collection processes")
print("- Socioeconomic factors impacting crime rates and law enforcement effectiveness")
# Add more specific reasons as per the context of the dataset and external factors influencing crime and law enforcement.

Between which years can you see the biggest change in Arrest Rate?
Between 2001 and 2002

Possible reasons for the drop in arrest rate between 2001 and 2002:
- Changes in law enforcement policies or priorities
- Changes in community relations or trust in law enforcement
- Changes in crime reporting methods or data collection processes
- Socioeconomic factors impacting crime rates and law enforcement effectiveness

	ID	BEAT	DISTRICT	WARD	COMMUNITY_AREA_NUMBER	X_COORDINATE	Y_COORDINATE	YEAR	LATITUDE	LONGITUDE
count	4.870000e+02	487.000000	487.00000	487.000000	487.000000	4.870000e+02	4.870000e+02	487.000000	487.000000	487.000000
mean	6.622419e+06	1194.279261	11.36345	22.624230	37.595483	1.162707e+06	1.886352e+06	2008.969199	41.843787	-87.678434
std	2.828354e+06	665.280757	6.56601	13.088511	21.457648	1.637996e+04	3.036550e+04	4.668989	0.083528	0.059614
min	2.114900e+04	111.000000	1.00000	1.000000	1.000000	1.100658e+06	1.814512e+06	2001.000000	41.645796	-87.905227
25%	3.978664e+06	711.000000	6.00000	12.000000	23.000000	1.151282e+06	1.860430e+06	2005.000000	41.772486	-87.720007
50%	6.780581e+06	1111.000000	11.00000	24.000000	30.000000	1.162315e+06	1.891618e+06	2009.000000	41.858444	-87.679976
75%	9.123182e+06	1652.500000	16.00000	32.000000	58.000000	1.174478e+06	1.908020e+06	2013.000000	41.903339	-87.635440
max	1.127717e+07	2535.000000	25.00000	50.000000	77.000000	1.204126e+06	1.951001e+06	2018.000000	42.021178	-87.528223

Return Home

Return to Python Projects Page

Chicago Crime Data Exploratory Data Analysis¶

Task 1: From what date is the oldest data point in the data set?¶

Task 2: Which year had the largest amount of crimes and how many crimes were committed that year?¶

Task 4: What year had the highest arrest rate? Plot the number of crimes per year and comment on the trend.¶

Task 5: Which year had the largest number of crimes leading to an arrest? How many arrests were made during that year? Plot the trend for total number of arrests per year.¶

Task 6: What has the arrest rate looked like over time?¶

Task 1¶

From what date is the oldest data point in the data set?¶

Task 2¶

Which year had the largest amount of crimes and how many crimes were committed that year?¶

Task 3¶

Task 4¶

What year had the highest arrest rate? Plot the number of crimes per year and comment on the trend.¶

Task 5¶

Which year had the largest number of crimes leading to an arrest? How many arrests were made during that year? Plot the trend for total number of arrests per year.¶

Task 6¶

What has the arrest rate looked like over time?¶

Plot the trend of the arrest rate.¶

Between which years can you see the biggest change in "Arrest Rate"?¶

Can you point at specific reasons why the Arrest Rate dropped between those years? Comment on your conclusions.¶

	ID	CASE_NUMBER	DATE	BLOCK	IUCR	PRIMARY_TYPE	DESCRIPTION	LOCATION_DESCRIPTION	ARREST	DOMESTIC	...	WARD	COMMUNITY_AREA_NUMBER	FBICODE	X_COORDINATE	Y_COORDINATE	YEAR	UPDATEDON	LATITUDE	LONGITUDE	LOCATION
0	3512276	HK587712	08/28/2004 05:50:56 PM	047XX S KEDZIE AVE	890	THEFT	FROM BUILDING	SMALL RETAIL STORE	False	False	...	14.0	58.0	6	1155838.0	1873050.0	2004	02/10/2018 03:50:01 PM	41.807440	-87.703956	(41.8074405, -87.703955849)
1	3406613	HK456306	06/26/2004 12:40:00 PM	009XX N CENTRAL PARK AVE	820	THEFT	$500 AND UNDER	OTHER	False	False	...	27.0	23.0	6	1152206.0	1906127.0	2004	02/28/2018 03:56:25 PM	41.898280	-87.716406	(41.898279962, -87.716405505)
2	8002131	HT233595	04/04/2011 05:45:00 AM	043XX S WABASH AVE	820	THEFT	$500 AND UNDER	NURSING HOME/RETIREMENT HOME	False	False	...	3.0	38.0	6	1177436.0	1876313.0	2011	02/10/2018 03:50:01 PM	41.815933	-87.624642	(41.815933131, -87.624642127)
3	7903289	HT133522	12/30/2010 04:30:00 PM	083XX S KINGSTON AVE	840	THEFT	FINANCIAL ID THEFT: OVER $300	RESIDENCE	False	False	...	7.0	46.0	6	1194622.0	1850125.0	2010	02/10/2018 03:50:01 PM	41.743665	-87.562463	(41.743665322, -87.562462756)
4	10402076	HZ138551	02/02/2016 07:30:00 PM	033XX W 66TH ST	820	THEFT	$500 AND UNDER	ALLEY	False	False	...	15.0	66.0	6	1155240.0	1860661.0	2016	02/10/2018 03:50:01 PM	41.773455	-87.706480	(41.773455295, -87.706480471)

Return Home

Return to Python Projects Page

Chicago Crime Data Exploratory Data Analysis¶

Task 1: From what date is the oldest data point in the data set?¶

Task 2: Which year had the largest amount of crimes and how many crimes were committed that year?¶

Task 3: Let's define "Arrest Rate" as the share of crimes that led to an arrest. What were the five most common crimes in 2020? Which of those has the highest and lowest arrest rate?¶

Task 4: What year had the highest arrest rate? Plot the number of crimes per year and comment on the trend.¶

Task 5: Which year had the largest number of crimes leading to an arrest? How many arrests were made during that year? Plot the trend for total number of arrests per year.¶

Task 6: What has the arrest rate looked like over time?¶

Task 1¶

From what date is the oldest data point in the data set?¶

Task 2¶

Which year had the largest amount of crimes and how many crimes were committed that year?¶

Task 3¶

Let's define "Arrest Rate" as the share of crimes that led to an arrest. What were the five most common crimes in 2020? Which of those has the highest and lowest arrest rate?¶

Task 4¶

What year had the highest arrest rate? Plot the number of crimes per year and comment on the trend.¶

Task 5¶

Which year had the largest number of crimes leading to an arrest? How many arrests were made during that year? Plot the trend for total number of arrests per year.¶

Task 6¶

What has the arrest rate looked like over time?¶

Plot the trend of the arrest rate.¶

Between which years can you see the biggest change in "Arrest Rate"?¶

Can you point at specific reasons why the Arrest Rate dropped between those years? Comment on your conclusions.¶